# Record the R version, platform and loaded packages used for this run
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.0  magrittr_2.0.1  tools_3.6.0     htmltools_0.5.0
##  [5] yaml_2.2.1      stringi_1.5.3   rmarkdown_2.6   knitr_1.30     
##  [9] stringr_1.4.0   xfun_0.19       digest_0.6.27   rlang_0.4.10   
## [13] evaluate_0.14

Load tidyverse and other packages for this lecture:

library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.4     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("rvest")
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library("quantmod")
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

Web scraping

There is a wealth of data on the internet. How can we scrape and analyze it?

rvest

rvest is an R package written by Hadley Wickham which makes web scraping easy.

Example: Scraping from webpage

Rank

  • Use SelectorGadget to highlight the element we want to scrape

  • Use the CSS selector to get the rankings

    # Select every node matching `.text-primary` (the rank labels); the outer
    # parentheses print the resulting node set
    (rank_data_html <- webpage %>% html_nodes('.text-primary'))
    ## {xml_nodeset (100)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [3] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [4] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [5] <span class="lister-item-index unbold text-primary">5.</span>
    ##  [6] <span class="lister-item-index unbold text-primary">6.</span>
    ##  [7] <span class="lister-item-index unbold text-primary">7.</span>
    ##  [8] <span class="lister-item-index unbold text-primary">8.</span>
    ##  [9] <span class="lister-item-index unbold text-primary">9.</span>
    ## [10] <span class="lister-item-index unbold text-primary">10.</span>
    ## [11] <span class="lister-item-index unbold text-primary">11.</span>
    ## [12] <span class="lister-item-index unbold text-primary">12.</span>
    ## [13] <span class="lister-item-index unbold text-primary">13.</span>
    ## [14] <span class="lister-item-index unbold text-primary">14.</span>
    ## [15] <span class="lister-item-index unbold text-primary">15.</span>
    ## [16] <span class="lister-item-index unbold text-primary">16.</span>
    ## [17] <span class="lister-item-index unbold text-primary">17.</span>
    ## [18] <span class="lister-item-index unbold text-primary">18.</span>
    ## [19] <span class="lister-item-index unbold text-primary">19.</span>
    ## [20] <span class="lister-item-index unbold text-primary">20.</span>
    ## ...
    # Pull the text content out of each rank node and print it
    (rank_data <- rank_data_html %>% html_text())
    ##   [1] "1."   "2."   "3."   "4."   "5."   "6."   "7."   "8."   "9."   "10." 
    ##  [11] "11."  "12."  "13."  "14."  "15."  "16."  "17."  "18."  "19."  "20." 
    ##  [21] "21."  "22."  "23."  "24."  "25."  "26."  "27."  "28."  "29."  "30." 
    ##  [31] "31."  "32."  "33."  "34."  "35."  "36."  "37."  "38."  "39."  "40." 
    ##  [41] "41."  "42."  "43."  "44."  "45."  "46."  "47."  "48."  "49."  "50." 
    ##  [51] "51."  "52."  "53."  "54."  "55."  "56."  "57."  "58."  "59."  "60." 
    ##  [61] "61."  "62."  "63."  "64."  "65."  "66."  "67."  "68."  "69."  "70." 
    ##  [71] "71."  "72."  "73."  "74."  "75."  "76."  "77."  "78."  "79."  "80." 
    ##  [81] "81."  "82."  "83."  "84."  "85."  "86."  "87."  "88."  "89."  "90." 
    ##  [91] "91."  "92."  "93."  "94."  "95."  "96."  "97."  "98."  "99."  "100."
    # Parse labels such as "1." into integers and print them
    (rank_data <- rank_data %>% as.integer())
    ##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
    ##  [19]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
    ##  [37]  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
    ##  [55]  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
    ##  [73]  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
    ##  [91]  91  92  93  94  95  96  97  98  99 100

Title

  • Use SelectorGadget to find the CSS selector .lister-item-header a.

    # Select the title links with the `.lister-item-header a` selector and
    # print the node set
    (title_data_html <- webpage %>% html_nodes('.lister-item-header a'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/title/tt3741700/?ref_=adv_li_tt">Godzilla: King of the Monster ...
    ##  [2] <a href="/title/tt9608818/?ref_=adv_li_tt">Our Friend</a>
    ##  [3] <a href="/title/tt4154796/?ref_=adv_li_tt">Avengers: Endgame</a>
    ##  [4] <a href="/title/tt4154664/?ref_=adv_li_tt">Captain Marvel</a>
    ##  [5] <a href="/title/tt8367814/?ref_=adv_li_tt">The Gentlemen</a>
    ##  [6] <a href="/title/tt6751668/?ref_=adv_li_tt">Parasite</a>
    ##  [7] <a href="/title/tt5363618/?ref_=adv_li_tt">Sound of Metal</a>
    ##  [8] <a href="/title/tt7131622/?ref_=adv_li_tt">Once Upon a Time... In Hollyw ...
    ##  [9] <a href="/title/tt9016974/?ref_=adv_li_tt">Synchronic</a>
    ## [10] <a href="/title/tt8946378/?ref_=adv_li_tt">Knives Out</a>
    ## [11] <a href="/title/tt8579674/?ref_=adv_li_tt">1917</a>
    ## [12] <a href="/title/tt6394270/?ref_=adv_li_tt">Bombshell</a>
    ## [13] <a href="/title/tt7286456/?ref_=adv_li_tt">Joker</a>
    ## [14] <a href="/title/tt2527338/?ref_=adv_li_tt">Star Wars: The Rise Of Skywal ...
    ## [15] <a href="/title/tt8772262/?ref_=adv_li_tt">Midsommar</a>
    ## [16] <a href="/title/tt2584384/?ref_=adv_li_tt">Jojo Rabbit</a>
    ## [17] <a href="/title/tt3281548/?ref_=adv_li_tt">Little Women</a>
    ## [18] <a href="/title/tt1950186/?ref_=adv_li_tt">Ford v Ferrari</a>
    ## [19] <a href="/title/tt10195452/?ref_=adv_li_tt">American Skin</a>
    ## [20] <a href="/title/tt1302006/?ref_=adv_li_tt">The Irishman</a>
    ## ...
    # Extract the link text (the movie titles) and print it
    (title_data <- title_data_html %>% html_text())
    ##   [1] "Godzilla: King of the Monsters"            
    ##   [2] "Our Friend"                                
    ##   [3] "Avengers: Endgame"                         
    ##   [4] "Captain Marvel"                            
    ##   [5] "The Gentlemen"                             
    ##   [6] "Parasite"                                  
    ##   [7] "Sound of Metal"                            
    ##   [8] "Once Upon a Time... In Hollywood"          
    ##   [9] "Synchronic"                                
    ##  [10] "Knives Out"                                
    ##  [11] "1917"                                      
    ##  [12] "Bombshell"                                 
    ##  [13] "Joker"                                     
    ##  [14] "Star Wars: The Rise Of Skywalker"          
    ##  [15] "Midsommar"                                 
    ##  [16] "Jojo Rabbit"                               
    ##  [17] "Little Women"                              
    ##  [18] "Ford v Ferrari"                            
    ##  [19] "American Skin"                             
    ##  [20] "The Irishman"                              
    ##  [21] "Serenity"                                  
    ##  [22] "The Lighthouse"                            
    ##  [23] "After"                                     
    ##  [24] "Doctor Sleep"                              
    ##  [25] "Rocketman"                                 
    ##  [26] "Uncut Gems"                                
    ##  [27] "Jumanji: The Next Level"                   
    ##  [28] "The Peanut Butter Falcon"                  
    ##  [29] "Spider-Man: Far from Home"                 
    ##  [30] "Richard Jewell"                            
    ##  [31] "The Lion King"                             
    ##  [32] "Dark Waters"                               
    ##  [33] "Terminator: Dark Fate"                     
    ##  [34] "Glass"                                     
    ##  [35] "Don't Let Go"                              
    ##  [36] "The King"                                  
    ##  [37] "The Outpost"                               
    ##  [38] "The Platform"                              
    ##  [39] "6 Underground"                             
    ##  [40] "Alita: Battle Angel"                       
    ##  [41] "Cats"                                      
    ##  [42] "Booksmart"                                 
    ##  [43] "Aladdin"                                   
    ##  [44] "Fast & Furious Presents: Hobbs & Shaw"     
    ##  [45] "Us"                                        
    ##  [46] "Portrait of a Lady on Fire"                
    ##  [47] "John Wick: Chapter 3 - Parabellum"         
    ##  [48] "Escape Room"                               
    ##  [49] "Toy Story 4"                               
    ##  [50] "I See You"                                 
    ##  [51] "Shazam!"                                   
    ##  [52] "Midway"                                    
    ##  [53] "Seberg"                                    
    ##  [54] "The Professor and the Madman"              
    ##  [55] "Hustlers"                                  
    ##  [56] "Ad Astra"                                  
    ##  [57] "It Chapter Two"                            
    ##  [58] "Motherless Brooklyn"                       
    ##  [59] "Ready or Not"                              
    ##  [60] "Downton Abbey"                             
    ##  [61] "X-Men: Dark Phoenix"                       
    ##  [62] "Charlie's Angels"                          
    ##  [63] "Marriage Story"                            
    ##  [64] "Ma"                                        
    ##  [65] "The Assistant"                             
    ##  [66] "Saint Maud"                                
    ##  [67] "Extremely Wicked, Shockingly Evil and Vile"
    ##  [68] "Gemini Man"                                
    ##  [69] "The Informer"                              
    ##  [70] "Babyteeth"                                 
    ##  [71] "The Lodge"                                 
    ##  [72] "Official Secrets"                          
    ##  [73] "Frozen II"                                 
    ##  [74] "El Camino: A Breaking Bad Movie"           
    ##  [75] "The Vast of Night"                         
    ##  [76] "The Good Liar"                             
    ##  [77] "Just Mercy"                                
    ##  [78] "First Cow"                                 
    ##  [79] "Polar"                                     
    ##  [80] "Yesterday"                                 
    ##  [81] "Vivarium"                                  
    ##  [82] "Color Out of Space"                        
    ##  [83] "A Beautiful Day in the Neighborhood"       
    ##  [84] "The Highwaymen"                            
    ##  [85] "The Dead Don't Die"                        
    ##  [86] "Skyfire"                                   
    ##  [87] "Angel Has Fallen"                          
    ##  [88] "Zombieland: Double Tap"                    
    ##  [89] "21 Bridges"                                
    ##  [90] "Pinocchio"                                 
    ##  [91] "Triple Frontier"                           
    ##  [92] "Five Feet Apart"                           
    ##  [93] "The Personal History of David Copperfield" 
    ##  [94] "Crawl"                                     
    ##  [95] "Bad Education"                             
    ##  [96] "Honey Boy"                                 
    ##  [97] "Anna"                                      
    ##  [98] "Fighting with My Family"                   
    ##  [99] "Maleficent: Mistress of Evil"              
    ## [100] "Dreamland"

Description

  • # Using CSS selectors to scrape the description section
    # The outer parentheses print the selected node set
    (description_data_html <- webpage %>% html_nodes('.ratings-bar+ .text-muted'))
    ## {xml_nodeset (100)}
    ##  [1] <p class="text-muted">\n    The crypto-zoological agency Monarch faces o ...
    ##  [2] <p class="text-muted">\n    After receiving life-altering news, a couple ...
    ##  [3] <p class="text-muted">\n    After the devastating events of <a href="/ti ...
    ##  [4] <p class="text-muted">\n    Carol Danvers becomes one of the universe's  ...
    ##  [5] <p class="text-muted">\n    An American expat tries to sell off his high ...
    ##  [6] <p class="text-muted">\n    Greed and class discrimination threaten the  ...
    ##  [7] <p class="text-muted">\n    A heavy-metal drummer's life is thrown into  ...
    ##  [8] <p class="text-muted">\n    A faded television actor and his stunt doubl ...
    ##  [9] <p class="text-muted">\n    Two New Orleans paramedics' lives are ripped ...
    ## [10] <p class="text-muted">\n    A detective investigates the death of a patr ...
    ## [11] <p class="text-muted">\n    April 6th, 1917. As a regiment assembles to  ...
    ## [12] <p class="text-muted">\n    A group of women take on Fox News head <a hr ...
    ## [13] <p class="text-muted">\n    In Gotham City, mentally troubled comedian A ...
    ## [14] <p class="text-muted">\n    The surviving members of the resistance face ...
    ## [15] <p class="text-muted">\n    A couple travels to Northern Europe to visit ...
    ## [16] <p class="text-muted">\n    A young boy in Hitler's army finds out his m ...
    ## [17] <p class="text-muted">\n    Jo March reflects back and forth on her life ...
    ## [18] <p class="text-muted">\n    American car designer <a href="/name/nm07909 ...
    ## [19] <p class="text-muted">\n    A Marine veteran working as a school janitor ...
    ## [20] <p class="text-muted">\n    An old man recalls his time painting houses  ...
    ## ...
    # Extract the text of every description node
    description_data <- description_data_html %>% html_text()
    # Preview the first few descriptions
    description_data %>% head()
    ## [1] "\n    The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah."                              
    ## [2] "\n    After receiving life-altering news, a couple finds unexpected support from their best friend, who puts his own life on hold and moves into their family home, bringing an impact much greater and more profound than anyone could have imagined"
    ## [3] "\n    After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."           
    ## [4] "\n    Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races."                                                                                                 
    ## [5] "\n    An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                   
    ## [6] "\n    Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."
    # Drop the leading newline and the indentation that follows it
    description_data <- description_data %>% str_replace("^\\n\\s+", "")
    description_data %>% head()
    ## [1] "The crypto-zoological agency Monarch faces off against a battery of god-sized monsters, including the mighty Godzilla, who collides with Mothra, Rodan, and his ultimate nemesis, the three-headed King Ghidorah."                              
    ## [2] "After receiving life-altering news, a couple finds unexpected support from their best friend, who puts his own life on hold and moves into their family home, bringing an impact much greater and more profound than anyone could have imagined"
    ## [3] "After the devastating events of Avengers: Infinity War (2018), the universe is in ruins. With the help of remaining allies, the Avengers assemble once more in order to reverse Thanos' actions and restore balance to the universe."           
    ## [4] "Carol Danvers becomes one of the universe's most powerful heroes when Earth is caught in the middle of a galactic war between two alien races."                                                                                                 
    ## [5] "An American expat tries to sell off his highly profitable marijuana empire in London, triggering plots, schemes, bribery and blackmail in an attempt to steal his domain out from under him."                                                   
    ## [6] "Greed and class discrimination threaten the newly formed symbiotic relationship between the wealthy Park family and the destitute Kim clan."

Runtime

  • Retrieve runtime data
# Scrape the runtime of each movie: select the `.runtime` nodes, take their
# text, remove the " min" unit text and parse the remainder as integers
runtime_data <- html_text(html_nodes(webpage, '.runtime'))
runtime_data <- as.integer(str_replace(runtime_data, " min", ""))
# Print the parsed runtimes
runtime_data
##   [1] 132 124 181 123 113 132 120 161 102 130 119 109 122 141 148 108 135 152
##  [19]  89 209 106 109 105 152 121 135 123  97 129 131 118 126 128 129 103 140
##  [37] 123  94 128 122 110 102 128 137 116 122 130  99 100  98 132 138 102 124
##  [55] 110 123 169 144  95 122 113 118 137  99  87  84 110 117 113 118 108 112
##  [73] 103 122  91 109 137 122 118 116  97 111 109 132 104  97 121  99  99 125
##  [91] 125 116 119  87 108  94 118 108 119  98

Genre

  • Collect the (first) genre of each movie:

    # Select the `.genre` nodes of the page
    genre_data_html <- webpage %>% html_nodes('.genre')
    # Extract their text content
    genre_data <- genre_data_html %>% html_text()
    # Preview the raw genre strings
    genre_data %>% head()
    ## [1] "\nAction, Adventure, Fantasy            "
    ## [2] "\nDrama            "                     
    ## [3] "\nAction, Adventure, Drama            "  
    ## [4] "\nAction, Adventure, Sci-Fi            " 
    ## [5] "\nAction, Comedy, Crime            "     
    ## [6] "\nComedy, Drama, Thriller            "
    # Data-Preprocessing: keep only the first genre of each movie. The hyphen
    # is part of the character class so that a hyphenated genre such as
    # "Sci-Fi" is kept whole instead of being cut to "Sci".
    genre_data <- str_extract(genre_data, "[[:alpha:]\\-]+")
    # Converting each genre from text to factor
    #genre_data <- as.factor(genre_data)
    # Let's have another look at the genre data
    head(genre_data)
    ## [1] "Action" "Drama"  "Action" "Action" "Action" "Comedy"

Rating

  • # Using CSS selectors to scrape the IMDB rating section
    rating_data_html <- webpage %>% html_nodes('.ratings-imdb-rating strong')
    # Extract the rating strings from the selected nodes
    rating_data <- rating_data_html %>% html_text()
    # Preview the first few ratings (still character at this point)
    rating_data %>% head()
    ## [1] "6.0" "7.2" "8.4" "6.9" "7.8" "8.6"
    # Clean-up: parse the rating strings as numbers
    rating_data <- rating_data %>% as.numeric()
    # Print the full numeric rating vector
    rating_data
    ##   [1] 6.0 7.2 8.4 6.9 7.8 8.6 7.8 7.6 6.2 7.9 8.3 6.8 8.5 6.6 7.1 7.9 7.8 8.1
    ##  [19] 6.0 7.9 5.4 7.5 5.3 7.3 7.3 7.4 6.7 7.6 7.5 7.5 6.9 7.6 6.2 6.7 6.3 7.2
    ##  [37] 6.8 7.0 6.1 7.3 2.8 7.2 6.9 6.4 6.8 8.1 7.4 6.4 7.8 6.8 7.0 6.7 5.7 7.3
    ##  [55] 6.3 6.5 6.5 6.8 6.8 7.4 5.7 4.8 7.9 5.6 6.2 7.0 6.6 5.7 6.6 7.2 6.0 7.3
    ##  [73] 6.9 7.3 6.7 6.6 7.6 7.1 6.3 6.8 5.8 6.2 7.3 6.9 5.5 5.3 6.4 6.7 6.6 6.2
    ##  [91] 6.4 7.2 6.4 6.1 7.1 7.3 6.6 7.1 6.6 5.8

Votes

  • # Using CSS selectors to scrape the votes section
    votes_data_html <- webpage %>%
      html_nodes('.sort-num_votes-visible span:nth-child(2)')
    # Extract the vote counts as text
    votes_data <- votes_data_html %>% html_text()
    # Preview the first few raw vote strings
    votes_data %>% head()
    ## [1] "134,259" "1,437"   "810,893" "442,500" "237,935" "553,703"
    # Data-Preprocessing: remove every thousands separator. str_replace() would
    # drop only the first comma, so "1,234,567" would become "1234,567" and
    # then NA after the numeric conversion below.
    votes_data <- str_replace_all(votes_data, ",", "")
    # Data-Preprocessing: converting votes to numerical
    votes_data <- as.numeric(votes_data)
    # Let's have another look at the votes data
    votes_data
    ##   [1] 134259   1437 810893 442500 237935 553703  27602 551932   7253 454854
    ##  [11] 426526  89623 940168 374453 201112 298298 143417 291635   1867 324980
    ##  [21]  35928 143042  38219 143698 137492 220796 188125  66449 324470  62479
    ##  [31] 212343  60542 149742 208389   6458  89147  19241 164642 139547 228421
    ##  [41]  42303  92950 227804 167173 227434  63258 275709  87394 203359  29972
    ##  [51] 255823  66473   5793  35530  83873 199164 207878  43507 110013  39813
    ##  [61] 155646  57265 246835  39868  14488   3273  79188  86351  20862   9317
    ##  [71]  32133  33797 136991 183797  27700  26412  46813   8686  75564 113311
    ##  [81]  38582  32875  61222  75275  53157    868  79163 139759  50029   6817
    ##  [91] 108048  43694  12874  68014  31549  28073  61334  66632  85184   2405

Director

  • # Using CSS selectors to scrape the directors section
    # The outer parentheses print the selected node set
    (directors_data_html <- webpage %>% html_nodes('.text-muted+ p a:nth-child(1)'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm1002424/?ref_=adv_li_dr_0">Michael Dougherty</a>
    ##  [2] <a href="/name/nm1363250/?ref_=adv_li_dr_0">Gabriela Cowperthwaite</a>
    ##  [3] <a href="/name/nm0751577/?ref_=adv_li_dr_0">Anthony Russo</a>
    ##  [4] <a href="/name/nm1349818/?ref_=adv_li_dr_0">Anna Boden</a>
    ##  [5] <a href="/name/nm0005363/?ref_=adv_li_dr_0">Guy Ritchie</a>
    ##  [6] <a href="/name/nm0094435/?ref_=adv_li_dr_0">Bong Joon Ho</a>
    ##  [7] <a href="/name/nm2942187/?ref_=adv_li_dr_0">Darius Marder</a>
    ##  [8] <a href="/name/nm0000233/?ref_=adv_li_dr_0">Quentin Tarantino</a>
    ##  [9] <a href="/name/nm1918140/?ref_=adv_li_dr_0">Justin Benson</a>
    ## [10] <a href="/name/nm0426059/?ref_=adv_li_dr_0">Rian Johnson</a>
    ## [11] <a href="/name/nm0005222/?ref_=adv_li_dr_0">Sam Mendes</a>
    ## [12] <a href="/name/nm0005366/?ref_=adv_li_dr_0">Jay Roach</a>
    ## [13] <a href="/name/nm0680846/?ref_=adv_li_dr_0">Todd Phillips</a>
    ## [14] <a href="/name/nm0009190/?ref_=adv_li_dr_0">J.J. Abrams</a>
    ## [15] <a href="/name/nm4170048/?ref_=adv_li_dr_0">Ari Aster</a>
    ## [16] <a href="/name/nm0169806/?ref_=adv_li_dr_0">Taika Waititi</a>
    ## [17] <a href="/name/nm1950086/?ref_=adv_li_dr_0">Greta Gerwig</a>
    ## [18] <a href="/name/nm0003506/?ref_=adv_li_dr_0">James Mangold</a>
    ## [19] <a href="/name/nm1676649/?ref_=adv_li_dr_0">Nate Parker</a>
    ## [20] <a href="/name/nm0000217/?ref_=adv_li_dr_0">Martin Scorsese</a>
    ## ...
    # Extract the director names from the selected links
    directors_data <- directors_data_html %>% html_text()
    # Print all director names
    directors_data
    ##   [1] "Michael Dougherty"      "Gabriela Cowperthwaite" "Anthony Russo"         
    ##   [4] "Anna Boden"             "Guy Ritchie"            "Bong Joon Ho"          
    ##   [7] "Darius Marder"          "Quentin Tarantino"      "Justin Benson"         
    ##  [10] "Rian Johnson"           "Sam Mendes"             "Jay Roach"             
    ##  [13] "Todd Phillips"          "J.J. Abrams"            "Ari Aster"             
    ##  [16] "Taika Waititi"          "Greta Gerwig"           "James Mangold"         
    ##  [19] "Nate Parker"            "Martin Scorsese"        "Steven Knight"         
    ##  [22] "Robert Eggers"          "Jenny Gage"             "Mike Flanagan"         
    ##  [25] "Dexter Fletcher"        "Benny Safdie"           "Jake Kasdan"           
    ##  [28] "Tyler Nilson"           "Jon Watts"              "Clint Eastwood"        
    ##  [31] "Jon Favreau"            "Todd Haynes"            "Tim Miller"            
    ##  [34] "M. Night Shyamalan"     "Jacob Estes"            "David Michôd"          
    ##  [37] "Rod Lurie"              "Galder Gaztelu-Urrutia" "Michael Bay"           
    ##  [40] "Robert Rodriguez"       "Tom Hooper"             "Olivia Wilde"          
    ##  [43] "Guy Ritchie"            "David Leitch"           "Jordan Peele"          
    ##  [46] "Céline Sciamma"         "Chad Stahelski"         "Adam Robitel"          
    ##  [49] "Josh Cooley"            "Adam Randall"           "David F. Sandberg"     
    ##  [52] "Roland Emmerich"        "Benedict Andrews"       "Farhad Safinia"        
    ##  [55] "Lorene Scafaria"        "James Gray"             "Andy Muschietti"       
    ##  [58] "Edward Norton"          "Matt Bettinelli-Olpin"  "Michael Engler"        
    ##  [61] "Simon Kinberg"          "Elizabeth Banks"        "Noah Baumbach"         
    ##  [64] "Tate Taylor"            "Kitty Green"            "Rose Glass"            
    ##  [67] "Joe Berlinger"          "Ang Lee"                "Andrea Di Stefano"     
    ##  [70] "Shannon Murphy"         "Severin Fiala"          "Gavin Hood"            
    ##  [73] "Chris Buck"             "Vince Gilligan"         "Andrew Patterson"      
    ##  [76] "Bill Condon"            "Destin Daniel Cretton"  "Kelly Reichardt"       
    ##  [79] "Jonas Åkerlund"         "Danny Boyle"            "Lorcan Finnegan"       
    ##  [82] "Richard Stanley"        "Marielle Heller"        "John Lee Hancock"      
    ##  [85] "Jim Jarmusch"           "Simon West"             "Ric Roman Waugh"       
    ##  [88] "Ruben Fleischer"        "Brian Kirk"             "Matteo Garrone"        
    ##  [91] "J.C. Chandor"           "Justin Baldoni"         "Armando Iannucci"      
    ##  [94] "Alexandre Aja"          "Cory Finley"            "Alma Har'el"           
    ##  [97] "Luc Besson"             "Stephen Merchant"       "Joachim Rønning"       
    ## [100] "Miles Joris-Peyrafitte"

Actor

  • # Using CSS selectors to scrape the actors section
    # The outer parentheses print the selected node set
    (actors_data_html <- webpage %>% html_nodes('.lister-item-content .ghost+ a'))
    ## {xml_nodeset (100)}
    ##  [1] <a href="/name/nm0151419/?ref_=adv_li_st_0">Kyle Chandler</a>
    ##  [2] <a href="/name/nm0781981/?ref_=adv_li_st_0">Jason Segel</a>
    ##  [3] <a href="/name/nm0000375/?ref_=adv_li_st_0">Robert Downey Jr.</a>
    ##  [4] <a href="/name/nm0488953/?ref_=adv_li_st_0">Brie Larson</a>
    ##  [5] <a href="/name/nm0000190/?ref_=adv_li_st_0">Matthew McConaughey</a>
    ##  [6] <a href="/name/nm0814280/?ref_=adv_li_st_0">Kang-ho Song</a>
    ##  [7] <a href="/name/nm1981893/?ref_=adv_li_st_0">Riz Ahmed</a>
    ##  [8] <a href="/name/nm0000138/?ref_=adv_li_st_0">Leonardo DiCaprio</a>
    ##  [9] <a href="/name/nm1107001/?ref_=adv_li_st_0">Anthony Mackie</a>
    ## [10] <a href="/name/nm0185819/?ref_=adv_li_st_0">Daniel Craig</a>
    ## [11] <a href="/name/nm2835616/?ref_=adv_li_st_0">Dean-Charles Chapman</a>
    ## [12] <a href="/name/nm0000234/?ref_=adv_li_st_0">Charlize Theron</a>
    ## [13] <a href="/name/nm0001618/?ref_=adv_li_st_0">Joaquin Phoenix</a>
    ## [14] <a href="/name/nm5397459/?ref_=adv_li_st_0">Daisy Ridley</a>
    ## [15] <a href="/name/nm6073955/?ref_=adv_li_st_0">Florence Pugh</a>
    ## [16] <a href="/name/nm9877392/?ref_=adv_li_st_0">Roman Griffin Davis</a>
    ## [17] <a href="/name/nm1519680/?ref_=adv_li_st_0">Saoirse Ronan</a>
    ## [18] <a href="/name/nm0000354/?ref_=adv_li_st_0">Matt Damon</a>
    ## [19] <a href="/name/nm1165044/?ref_=adv_li_st_0">Omari Hardwick</a>
    ## [20] <a href="/name/nm0000134/?ref_=adv_li_st_0">Robert De Niro</a>
    ## ...
    # Extract the actor names from the selected links
    actors_data <- actors_data_html %>% html_text()
    # Preview the first few actor names
    actors_data %>% head()
    ## [1] "Kyle Chandler"       "Jason Segel"         "Robert Downey Jr."  
    ## [4] "Brie Larson"         "Matthew McConaughey" "Kang-ho Song"

Metascore

  • Be careful with missing data.

    # Select the `.metascore` nodes of the page
    metascore_data_html <- webpage %>% html_nodes('.metascore')
    # Extract the metascore strings from those nodes
    metascore_data <- metascore_data_html %>% html_text()
    # Preview the first few raw metascores (note the trailing spaces)
    metascore_data %>% head()
    ## [1] "48        " "57        " "78        " "64        " "51        "
    ## [6] "96        "
    # Clean-up: strip trailing whitespace, then parse the scores as numbers
    metascore_data <- metascore_data %>%
      str_replace("\\s*$", "") %>%
      as.numeric()
    # Print the numeric metascores
    metascore_data
    ##   [1] 48 57 78 64 51 96 81 83 63 82 78 64 59 53 72 58 91 81 24 94 37 83 30 59 69
    ##  [26] 91 58 70 69 68 55 73 54 43 49 62 71 73 41 53 32 84 53 60 81 95 73 48 84 65
    ##  [51] 71 47 54 27 79 80 58 60 64 64 43 52 94 53 79 84 52 38 61 77 64 63 64 72 84
    ##  [76] 55 68 89 19 55 64 70 80 58 53 47 45 55 51 64 61 53 77 60 79 73 40 68 43 57
    # Count how many metascores were scraped
    metascore_data %>% length()
    ## [1] 100
    # # Visual inspection finds 24, 85, 100 don't have metascore
    # ms <- rep(NA, 100)
    # ms[-c(24, 85, 100)] <- metascore_data
    # (metascore_data <- ms)

Gross

  • Be careful with missing data.

    # Select the gross revenue nodes
    gross_data_html <- webpage %>% html_nodes('.ghost~ .text-muted+ span')
    # Extract the gross revenue strings from those nodes
    gross_data <- gross_data_html %>% html_text()
    # Preview the first few raw values (formatted like "$110.50M")
    gross_data %>% head()
    ## [1] "$110.50M" "$858.37M" "$426.83M" "$53.37M"  "$142.50M" "$165.36M"
    # Clean-up: remove the first "M", keep characters 2 to 10 (which skips the
    # leading "$"), then parse the result as numbers
    # Alternative kept for reference:
    #(gross_data <- str_extract(gross_data, "[:digit:]+.[:digit:]+"))
    gross_data <- gross_data %>%
      str_replace("M", "") %>%
      str_sub(2, 10) %>%
      as.numeric()
    # Count how many movies reported a gross figure
    gross_data %>% length()
    ## [1] 56
    # Visual inspection finds below movies don't have gross
    #gs_data <- rep(NA, 100)
    #gs_data[-c(1, 2, 3, 5, 61, 69, 71, 74, 78, 82, 84:87, 90)] <- gross_data
    #(gross_data <- gs_data)

    44 (out of 100) movies don’t have gross data yet! We need a better way to figure out missing entries.

    # Select rank labels and gross figures with one combined selector so they
    # come back in a single vector, each gross value following its rank label
    rank_and_gross <- html_text(
      html_nodes(webpage, '.ghost~ .text-muted+ span , .text-primary')
    )
    # Remove the first run of whitespace in each entry, then every "$" and "M"
    rank_and_gross <- str_replace(rank_and_gross, "\\s+", "")
    rank_and_gross <- str_replace_all(rank_and_gross, "[$M]", "")
    # Print the combined vector
    rank_and_gross
    ##   [1] "1."     "110.50" "2."     "3."     "858.37" "4."     "426.83" "5."    
    ##   [9] "6."     "53.37"  "7."     "8."     "142.50" "9."     "10."    "165.36"
    ##  [17] "11."    "159.23" "12."    "13."    "335.45" "14."    "515.20" "15."   
    ##  [25] "27.33"  "16."    "0.35"   "17."    "108.10" "18."    "117.62" "19."   
    ##  [33] "20."    "7.00"   "21."    "8.55"   "22."    "0.43"   "23."    "12.14" 
    ##  [41] "24."    "25."    "96.37"  "26."    "27."    "316.83" "28."    "13.12" 
    ##  [49] "29."    "390.53" "30."    "31."    "543.64" "32."    "33."    "62.25" 
    ##  [57] "34."    "111.05" "35."    "4.69"   "36."    "37."    "38."    "39."   
    ##  [65] "40."    "85.71"  "41."    "42."    "22.68"  "43."    "355.56" "44."   
    ##  [73] "173.96" "45."    "175.08" "46."    "3.76"   "47."    "171.02" "48."   
    ##  [81] "57.01"  "49."    "434.04" "50."    "51."    "140.37" "52."    "53."   
    ##  [89] "54."    "55."    "80.55"  "56."    "35.40"  "57."    "211.59" "58."   
    ##  [97] "59."    "26.74"  "60."    "96.85"  "61."    "65.85"  "62."    "63."   
    ## [105] "2.00"   "64."    "45.37"  "65."    "66."    "67."    "68."    "20.55" 
    ## [113] "69."    "70."    "71."    "72."    "0.40"   "73."    "477.37" "74."   
    ## [121] "75."    "76."    "77."    "78."    "79."    "80."    "73.29"  "81."   
    ## [129] "82."    "83."    "61.70"  "84."    "85."    "6.56"   "86."    "87."   
    ## [137] "67.16"  "88."    "26.80"  "89."    "90."    "91."    "92."    "45.73" 
    ## [145] "93."    "94."    "39.01"  "95."    "96."    "97."    "7.74"   "98."   
    ## [153] "22.96"  "99."    "113.93" "100."
    # Entries ending in "." are rank labels; all other entries are gross figures
    isrank <- str_detect(rank_and_gross, "\\.$")
    # A rank has no gross when the next entry is also a rank, or when the rank
    # is the very last entry
    ismissing <- isrank & c(isrank[-1], TRUE)
    missingpos <- as.integer(rank_and_gross[ismissing])
    # One slot per scraped rank rather than a hard-coded 100
    gs_data <- rep(NA_real_, sum(isrank))
    # Guard against a mismatch between the two scrapes before filling
    stopifnot(length(gross_data) == length(gs_data) - length(missingpos))
    # Fill by positive index: gs_data[-missingpos] would select nothing at all
    # if missingpos were empty (i.e. no movie is missing its gross)
    gs_data[setdiff(seq_along(gs_data), missingpos)] <- gross_data
    (gross_data <- gs_data)
    ##   [1] 110.50     NA 858.37 426.83     NA  53.37     NA 142.50     NA 165.36
    ##  [11] 159.23     NA 335.45 515.20  27.33   0.35 108.10 117.62     NA   7.00
    ##  [21]   8.55   0.43  12.14     NA  96.37     NA 316.83  13.12 390.53     NA
    ##  [31] 543.64     NA  62.25 111.05   4.69     NA     NA     NA     NA  85.71
    ##  [41]     NA  22.68 355.56 173.96 175.08   3.76 171.02  57.01 434.04     NA
    ##  [51] 140.37     NA     NA     NA  80.55  35.40 211.59     NA  26.74  96.85
    ##  [61]  65.85     NA   2.00  45.37     NA     NA     NA  20.55     NA     NA
    ##  [71]     NA   0.40 477.37     NA     NA     NA     NA     NA     NA  73.29
    ##  [81]     NA     NA  61.70     NA   6.56     NA  67.16  26.80     NA     NA
    ##  [91]     NA  45.73     NA  39.01     NA     NA   7.74  22.96 113.93     NA

Missing entries - more reproducible way

  • The following code programmatically figures out missing entries for metascore.

    # Use CSS selectors to scrape the rank and metascore nodes together
    rank_metascore_data_html <- webpage %>%
      html_nodes('.unfavorable , .favorable , .mixed , .text-primary')
    rank_metascore_data_html
    ## {xml_nodeset (200)}
    ##  [1] <span class="lister-item-index unbold text-primary">1.</span>
    ##  [2] <span class="metascore  mixed">48        </span>
    ##  [3] <span class="lister-item-index unbold text-primary">2.</span>
    ##  [4] <span class="metascore  mixed">57        </span>
    ##  [5] <span class="lister-item-index unbold text-primary">3.</span>
    ##  [6] <span class="metascore  favorable">78        </span>
    ##  [7] <span class="lister-item-index unbold text-primary">4.</span>
    ##  [8] <span class="metascore  favorable">64        </span>
    ##  [9] <span class="lister-item-index unbold text-primary">5.</span>
    ## [10] <span class="metascore  mixed">51        </span>
    ## [11] <span class="lister-item-index unbold text-primary">6.</span>
    ## [12] <span class="metascore  favorable">96        </span>
    ## [13] <span class="lister-item-index unbold text-primary">7.</span>
    ## [14] <span class="metascore  favorable">81        </span>
    ## [15] <span class="lister-item-index unbold text-primary">8.</span>
    ## [16] <span class="metascore  favorable">83        </span>
    ## [17] <span class="lister-item-index unbold text-primary">9.</span>
    ## [18] <span class="metascore  favorable">63        </span>
    ## [19] <span class="lister-item-index unbold text-primary">10.</span>
    ## [20] <span class="metascore  favorable">82        </span>
    ## ...
    # Convert the rank and metascore nodes to text
    rank_metascore_data <- rank_metascore_data_html %>% html_text()
    rank_metascore_data
    ##   [1] "1."         "48        " "2."         "57        " "3."        
    ##   [6] "78        " "4."         "64        " "5."         "51        "
    ##  [11] "6."         "96        " "7."         "81        " "8."        
    ##  [16] "83        " "9."         "63        " "10."        "82        "
    ##  [21] "11."        "78        " "12."        "64        " "13."       
    ##  [26] "59        " "14."        "53        " "15."        "72        "
    ##  [31] "16."        "58        " "17."        "91        " "18."       
    ##  [36] "81        " "19."        "24        " "20."        "94        "
    ##  [41] "21."        "37        " "22."        "83        " "23."       
    ##  [46] "30        " "24."        "59        " "25."        "69        "
    ##  [51] "26."        "91        " "27."        "58        " "28."       
    ##  [56] "70        " "29."        "69        " "30."        "68        "
    ##  [61] "31."        "55        " "32."        "73        " "33."       
    ##  [66] "54        " "34."        "43        " "35."        "49        "
    ##  [71] "36."        "62        " "37."        "71        " "38."       
    ##  [76] "73        " "39."        "41        " "40."        "53        "
    ##  [81] "41."        "32        " "42."        "84        " "43."       
    ##  [86] "53        " "44."        "60        " "45."        "81        "
    ##  [91] "46."        "95        " "47."        "73        " "48."       
    ##  [96] "48        " "49."        "84        " "50."        "65        "
    ## [101] "51."        "71        " "52."        "47        " "53."       
    ## [106] "54        " "54."        "27        " "55."        "79        "
    ## [111] "56."        "80        " "57."        "58        " "58."       
    ## [116] "60        " "59."        "64        " "60."        "64        "
    ## [121] "61."        "43        " "62."        "52        " "63."       
    ## [126] "94        " "64."        "53        " "65."        "79        "
    ## [131] "66."        "84        " "67."        "52        " "68."       
    ## [136] "38        " "69."        "61        " "70."        "77        "
    ## [141] "71."        "64        " "72."        "63        " "73."       
    ## [146] "64        " "74."        "72        " "75."        "84        "
    ## [151] "76."        "55        " "77."        "68        " "78."       
    ## [156] "89        " "79."        "19        " "80."        "55        "
    ## [161] "81."        "64        " "82."        "70        " "83."       
    ## [166] "80        " "84."        "58        " "85."        "53        "
    ## [171] "86."        "47        " "87."        "45        " "88."       
    ## [176] "55        " "89."        "51        " "90."        "64        "
    ## [181] "91."        "61        " "92."        "53        " "93."       
    ## [186] "77        " "94."        "60        " "95."        "79        "
    ## [191] "96."        "73        " "97."        "40        " "98."       
    ## [196] "68        " "99."        "43        " "100."       "57        "
    # Remove the first run of whitespace in each entry (the trailing padding
    # after each metascore)
    rank_metascore_data <- rank_metascore_data %>% str_replace("\\s+", "")
    rank_metascore_data
    ##   [1] "1."   "48"   "2."   "57"   "3."   "78"   "4."   "64"   "5."   "51"  
    ##  [11] "6."   "96"   "7."   "81"   "8."   "83"   "9."   "63"   "10."  "82"  
    ##  [21] "11."  "78"   "12."  "64"   "13."  "59"   "14."  "53"   "15."  "72"  
    ##  [31] "16."  "58"   "17."  "91"   "18."  "81"   "19."  "24"   "20."  "94"  
    ##  [41] "21."  "37"   "22."  "83"   "23."  "30"   "24."  "59"   "25."  "69"  
    ##  [51] "26."  "91"   "27."  "58"   "28."  "70"   "29."  "69"   "30."  "68"  
    ##  [61] "31."  "55"   "32."  "73"   "33."  "54"   "34."  "43"   "35."  "49"  
    ##  [71] "36."  "62"   "37."  "71"   "38."  "73"   "39."  "41"   "40."  "53"  
    ##  [81] "41."  "32"   "42."  "84"   "43."  "53"   "44."  "60"   "45."  "81"  
    ##  [91] "46."  "95"   "47."  "73"   "48."  "48"   "49."  "84"   "50."  "65"  
    ## [101] "51."  "71"   "52."  "47"   "53."  "54"   "54."  "27"   "55."  "79"  
    ## [111] "56."  "80"   "57."  "58"   "58."  "60"   "59."  "64"   "60."  "64"  
    ## [121] "61."  "43"   "62."  "52"   "63."  "94"   "64."  "53"   "65."  "79"  
    ## [131] "66."  "84"   "67."  "52"   "68."  "38"   "69."  "61"   "70."  "77"  
    ## [141] "71."  "64"   "72."  "63"   "73."  "64"   "74."  "72"   "75."  "84"  
    ## [151] "76."  "55"   "77."  "68"   "78."  "89"   "79."  "19"   "80."  "55"  
    ## [161] "81."  "64"   "82."  "70"   "83."  "80"   "84."  "58"   "85."  "53"  
    ## [171] "86."  "47"   "87."  "45"   "88."  "55"   "89."  "51"   "90."  "64"  
    ## [181] "91."  "61"   "92."  "53"   "93."  "77"   "94."  "60"   "95."  "79"  
    ## [191] "96."  "73"   "97."  "40"   "98."  "68"   "99."  "43"   "100." "57"
    # Rank labels end with "."; a rank followed by another rank means the
    # metascore for the first of the two is missing
    isrank <- rank_metascore_data %>% str_detect("\\.$")
    isrank
    ##   [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [13]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [25]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [37]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [49]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [61]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [73]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [85]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ##  [97]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [109]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [121]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [133]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [145]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [157]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [169]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [181]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    ## [193]  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE FALSE
    # ismissing[i] is TRUE when entry i is a rank and entry i + 1 is also a rank;
    # the final entry counts as missing whenever it is itself a rank.
    # (The previous `1:length(x)-1` parsed as `(1:n) - 1`, i.e. 0:(n-1), and only
    # gave the right answer because a 0 index is silently dropped.)
    ismissing <- isrank & c(isrank[-1], TRUE)
    (missingpos <- as.integer(rank_metascore_data[ismissing]))
    ## integer(0)
    #(rank_metascore_data <- as.integer(rank_metascore_data))
  • You (students) should work out the code for finding missing positions for gross.

Visualizing movie data

  • Form a tibble:

    # Assemble the scraped vectors into a single tibble, one column each
    movies <- tibble(
      Rank = rank_data,
      Title = title_data,
      Description = description_data,
      Runtime = runtime_data,
      Genre = genre_data,
      Rating = rating_data,
      Metascore = metascore_data,
      Votes = votes_data,
      Gross_Earning_in_Mil = gross_data,
      Director = directors_data,
      Actor = actors_data
    )
    # Show every column rather than truncating to the console width
    print(movies, width = Inf)
    ## # A tibble: 100 x 11
    ##     Rank Title                           
    ##    <int> <chr>                           
    ##  1     1 Godzilla: King of the Monsters  
    ##  2     2 Our Friend                      
    ##  3     3 Avengers: Endgame               
    ##  4     4 Captain Marvel                  
    ##  5     5 The Gentlemen                   
    ##  6     6 Parasite                        
    ##  7     7 Sound of Metal                  
    ##  8     8 Once Upon a Time... In Hollywood
    ##  9     9 Synchronic                      
    ## 10    10 Knives Out                      
    ##    Description                                                                  
    ##    <chr>                                                                        
    ##  1 The crypto-zoological agency Monarch faces off against a battery of god-size…
    ##  2 After receiving life-altering news, a couple finds unexpected support from t…
    ##  3 After the devastating events of Avengers: Infinity War (2018), the universe …
    ##  4 Carol Danvers becomes one of the universe's most powerful heroes when Earth …
    ##  5 An American expat tries to sell off his highly profitable marijuana empire i…
    ##  6 Greed and class discrimination threaten the newly formed symbiotic relations…
    ##  7 A heavy-metal drummer's life is thrown into freefall when he begins to lose …
    ##  8 A faded television actor and his stunt double strive to achieve fame and suc…
    ##  9 Two New Orleans paramedics' lives are ripped apart after they encounter a se…
    ## 10 A detective investigates the death of a patriarch of an eccentric, combative…
    ##    Runtime Genre  Rating Metascore  Votes Gross_Earning_in_Mil
    ##      <int> <chr>   <dbl>     <dbl>  <dbl>                <dbl>
    ##  1     132 Action    6          48 134259                110. 
    ##  2     124 Drama     7.2        57   1437                 NA  
    ##  3     181 Action    8.4        78 810893                858. 
    ##  4     123 Action    6.9        64 442500                427. 
    ##  5     113 Action    7.8        51 237935                 NA  
    ##  6     132 Comedy    8.6        96 553703                 53.4
    ##  7     120 Drama     7.8        81  27602                 NA  
    ##  8     161 Comedy    7.6        83 551932                142. 
    ##  9     102 Drama     6.2        63   7253                 NA  
    ## 10     130 Comedy    7.9        82 454854                165. 
    ##    Director               Actor              
    ##    <chr>                  <chr>              
    ##  1 Michael Dougherty      Kyle Chandler      
    ##  2 Gabriela Cowperthwaite Jason Segel        
    ##  3 Anthony Russo          Robert Downey Jr.  
    ##  4 Anna Boden             Brie Larson        
    ##  5 Guy Ritchie            Matthew McConaughey
    ##  6 Bong Joon Ho           Kang-ho Song       
    ##  7 Darius Marder          Riz Ahmed          
    ##  8 Quentin Tarantino      Leonardo DiCaprio  
    ##  9 Justin Benson          Anthony Mackie     
    ## 10 Rian Johnson           Daniel Craig       
    ## # … with 90 more rows
  • How many top 100 movies are in each genre? (Be careful with interpretation.)

    # Bar chart of the number of movies per genre
    ggplot(data = movies, mapping = aes(x = Genre)) +
      geom_bar()

  • Which genre is most profitable in terms of average gross earnings?

    # Mean gross per genre (movies without a gross figure are ignored),
    # drawn as one column per genre
    movies %>%
      group_by(Genre) %>%
      summarise(avg_earning = mean(Gross_Earning_in_Mil, na.rm = TRUE)) %>%
      ggplot(mapping = aes(x = Genre, y = avg_earning)) +
      geom_col() +
      labs(y = "avg earning in millions")
    ## `summarise()` ungrouping output (override with `.groups` argument)

    # Distribution of gross earnings within each genre
    movies %>%
      ggplot(mapping = aes(x = Genre, y = Gross_Earning_in_Mil)) +
      geom_boxplot() +
      labs(y = "Gross earning in millions")
    ## Warning: Removed 44 rows containing non-finite values (stat_boxplot).

  • Is there a relationship between gross earning and rating? Find the best selling movie (by gross earning) in each genre

    library("ggrepel")
    # Within each genre keep the movie ranked first by descending gross;
    # the result stays grouped by Genre
    best_in_genre <- movies %>%
      group_by(Genre) %>%
      filter(row_number(desc(Gross_Earning_in_Mil)) == 1)
    print(best_in_genre, width = Inf)
    ## # A tibble: 8 x 11
    ## # Groups:   Genre [8]
    ##    Rank Title            
    ##   <int> <chr>            
    ## 1     3 Avengers: Endgame
    ## 2    10 Knives Out       
    ## 3    13 Joker            
    ## 4    25 Rocketman        
    ## 5    31 The Lion King    
    ## 6    43 Aladdin          
    ## 7    45 Us               
    ## 8    57 It Chapter Two   
    ##   Description                                                                   
    ##   <chr>                                                                         
    ## 1 After the devastating events of Avengers: Infinity War (2018), the universe i…
    ## 2 A detective investigates the death of a patriarch of an eccentric, combative …
    ## 3 In Gotham City, mentally troubled comedian Arthur Fleck is disregarded and mi…
    ## 4 A musical fantasy about the fantastical human story of Elton John's breakthro…
    ## 5 After the murder of his father, a young lion prince flees his kingdom only to…
    ## 6 A kind-hearted street urchin and a power-hungry Grand Vizier vie for a magic …
    ## 7 A family's serene beach vacation turns to chaos when their doppelgängers appe…
    ## 8 Twenty-seven years after their first encounter with the terrifying Pennywise,…
    ##   Runtime Genre     Rating Metascore  Votes Gross_Earning_in_Mil Director       
    ##     <int> <chr>      <dbl>     <dbl>  <dbl>                <dbl> <chr>          
    ## 1     181 Action       8.4        78 810893                858.  Anthony Russo  
    ## 2     130 Comedy       7.9        82 454854                165.  Rian Johnson   
    ## 3     122 Crime        8.5        59 940168                335.  Todd Phillips  
    ## 4     121 Biography    7.3        69 137492                 96.4 Dexter Fletcher
    ## 5     118 Animation    6.9        55 212343                544.  Jon Favreau    
    ## 6     128 Adventure    6.9        53 227804                356.  Guy Ritchie    
    ## 7     116 Horror       6.8        81 227434                175.  Jordan Peele   
    ## 8     169 Drama        6.5        58 207878                212.  Andy Muschietti
    ##   Actor            
    ##   <chr>            
    ## 1 Robert Downey Jr.
    ## 2 Daniel Craig     
    ## 3 Joaquin Phoenix  
    ## 4 Taron Egerton    
    ## 5 Donald Glover    
    ## 6 Will Smith       
    ## 7 Lupita Nyong'o   
    ## 8 Jessica Chastain
    # Rating vs. gross: point size shows votes, colour shows genre, and the
    # top-grossing movie of each genre is labelled with its title
    movies %>%
      ggplot(aes(x = Rating, y = Gross_Earning_in_Mil)) +
      geom_point(aes(size = Votes, color = Genre)) +
      ggrepel::geom_label_repel(data = best_in_genre, mapping = aes(label = Title)) +
      labs(y = "Gross earning in millions")
    ## Warning: Removed 44 rows containing missing values (geom_point).

Example: Scraping image data from Google

Complete search operators are described at http://www.googleguide.com/advanced_operators_reference.html.

searchTerm <- "ucla"
# tbm=isch (images), app (apps), bks (books), nws (news), pts (patents), vid (videos)
# tbs=isz:m (medium images)
# <https://stenevang.wordpress.com/2013/02/22/google-advanced-power-search-url-request-parameters/>
# Percent-encode the search term so that spaces, "&", "#", etc. cannot corrupt
# the query string; a plain term such as "ucla" is left unchanged
(url <- paste0("https://www.google.com/search?q=",
               utils::URLencode(searchTerm, reserved = TRUE),
               "&source=lnms&tbm=isch&sa=X&tbs=isz:m"))
## [1] "https://www.google.com/search?q=ucla&source=lnms&tbm=isch&sa=X&tbs=isz:m"
# Fetch the search result page and collect the src attribute of every <img>
webpage <- read_html(url)
imageurl <- html_attr(html_nodes(webpage, "img"), "src")
imageurl
##  [1] "/images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif"                                           
##  [2] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRr43DEHP5AZ2iCPRJHr1td2vvdkmRt9WEyykQetMxrpt1Ds9ivm-IS6mFsCiM&s"
##  [3] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQyrIZegIRkoE5V3QnCJGh6pEuyfgdYihBK-k2quXp-dEtF1KemJ0IqDt6UHg&s" 
##  [4] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRQGvnKLi5K6Rr9jfI46kb4iA1xLuNERddhhnnKD8nJu4MON8T6Zb8ZDVfvhA&s" 
##  [5] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTaVkTW1k3T5bzQKpciNTlhP5PDka4AN_yc2Uk8bcPwTxajHqYbUMhxPwrDzRo&s"
##  [6] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRBK0MZtZomk2WJwEJtY_S9x3EcH0581QtH8dnCSO_KKEpxbesrlEKObgp_Vg&s" 
##  [7] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSn6k2Hlyo-iY5dmXB1EQ71ST6WfK3UxBjpkhFPPMuHYUtwxFeeRWM3QJGIkQ&s" 
##  [8] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRkS5eEmUyFj5pBprZ6QmUYNoQQKgcywqFhmpq9bDUpzIjtrvUPluedsWkUiA&s" 
##  [9] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSR20UMSvExwdx7ujozIKQ_DGVBPcIFeVqAPm64gxu3ZXhWO23eBsziladgKow&s"
## [10] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTZldEPmYFa9BiX5wq2V4A_mKOvQnKQ97EjF8p4u54xEWJisPtyvgRP1zIccsE&s"
## [11] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTRkspJbtVbUCbAgiZ6ivBjLN6Tvs5bXYT-mDqzJNnVr-CYyUg4JAwWXhPswQ&s" 
## [12] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTU-jfk6d9kvZoksHUem84EmfVshYE2A-9m5KBhx0xWLg7_7-t2Cj8QOUEQVw&s" 
## [13] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSCWSrFSIuW9ntgORAGHbio2VOE2neGgrvGv9WcynfOgR9lTgiW38SvZZfnHg&s" 
## [14] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQVSwgqb3jo1nQzxzQl--DP2blOypnGQW_mA7h3tg2xvEn7zSUgsK9r4lJh-A&s" 
## [15] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQpPPiegdL993rc-VddIcaG9AW9EiZfxG5F9Sf5QIa_UZJabsvwoszpJxTtTQ&s" 
## [16] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRUl2I1VUjg6Ehs5hQ_m3Wju45-uQoUDAG1n6FIE2Q3if6WnJvgIRjIyZNtyek&s"
## [17] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRVMK_S62VHTFrT-XleoFzC7j-Ur8h0GHF1ZBpZHwwbSftLh7bW45WcImS-7a4&s"
## [18] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTOtf90v5Yzyq1ctQMrjDGFFLBEG84Z0GeJX7FaaAf3BDkuL48ZYkVLoRzrdA&s" 
## [19] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSNnkY1HtCwX_QxsWz0N6oWagmQFTUrhTsvIW1xdYGiBVCbuyOGGfl9kHDMy6c&s"
## [20] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR-kpMQH1LPRgq-qzh7PZRlCqUtK226PYB3xHz7ZHOoluN211UZ8ztirjjO80c&s"
## [21] "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTNbZzMXPJMIVkIJp1hMHSNlI-kLKFqC1mhoHRaWVviOf4rwaorn4kxYVXms_A&s"

The following code is still not working…

# Download each URL in `files` to `<outPath>/<brand>_<i>.jpg`, where `i` is the
# position of the URL in `files`.
#
# files:   character vector of image URLs
# brand:   prefix used for the saved file names
# outPath: destination directory; created (recursively) if it does not exist
#
# Entries that are NA or not absolute URLs (e.g. site-relative paths such as
# "/images/logo.gif") are skipped with a warning. A download that raises an
# error is reported as a warning so the remaining files are still attempted.
# Returns NULL invisibly.
downloadImages <- function(files, brand, outPath="images"){
  # download.file() cannot open a destfile inside a missing directory
  if (!dir.exists(outPath)) {
    dir.create(outPath, recursive = TRUE)
  }
  # seq_along() yields an empty loop for empty input, unlike 1:length(files)
  for (i in seq_along(files)) {
    if (is.na(files[i]) || !grepl("^[A-Za-z][A-Za-z0-9+.-]*://", files[i])) {
      warning("skipping entry ", i, " (not an absolute URL): ", files[i],
              call. = FALSE)
      next
    }
    destfile <- file.path(outPath, paste0(brand, "_", i, ".jpg"))
    tryCatch(
      download.file(files[i], destfile = destfile, mode = 'wb'),
      error = function(e) {
        warning("failed to download ", files[i], ": ", conditionMessage(e),
                call. = FALSE)
      }
    )
  }
  invisible(NULL)
}
downloadImages(imageurl, "ucla")
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL /images/branding/searchlogo/1x/googlelogo_desk_heirloom_color_150x55dp.gif:
## cannot open destfile 'images/ucla_1.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcRr43DEHP5AZ2iCPRJHr1td2vvdkmRt9WEyykQetMxrpt1Ds9ivm-IS6mFsCiM&s:
## cannot open destfile 'images/ucla_2.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcQyrIZegIRkoE5V3QnCJGh6pEuyfgdYihBK-k2quXp-dEtF1KemJ0IqDt6UHg&s:
## cannot open destfile 'images/ucla_3.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcRQGvnKLi5K6Rr9jfI46kb4iA1xLuNERddhhnnKD8nJu4MON8T6Zb8ZDVfvhA&s:
## cannot open destfile 'images/ucla_4.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcTaVkTW1k3T5bzQKpciNTlhP5PDka4AN_yc2Uk8bcPwTxajHqYbUMhxPwrDzRo&s:
## cannot open destfile 'images/ucla_5.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcRBK0MZtZomk2WJwEJtY_S9x3EcH0581QtH8dnCSO_KKEpxbesrlEKObgp_Vg&s:
## cannot open destfile 'images/ucla_6.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSn6k2Hlyo-
## iY5dmXB1EQ71ST6WfK3UxBjpkhFPPMuHYUtwxFeeRWM3QJGIkQ&s: cannot open destfile
## 'images/ucla_7.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcRkS5eEmUyFj5pBprZ6QmUYNoQQKgcywqFhmpq9bDUpzIjtrvUPluedsWkUiA&s:
## cannot open destfile 'images/ucla_8.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcSR20UMSvExwdx7ujozIKQ_DGVBPcIFeVqAPm64gxu3ZXhWO23eBsziladgKow&s:
## cannot open destfile 'images/ucla_9.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcTZldEPmYFa9BiX5wq2V4A_mKOvQnKQ97EjF8p4u54xEWJisPtyvgRP1zIccsE&s:
## cannot open destfile 'images/ucla_10.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcTRkspJbtVbUCbAgiZ6ivBjLN6Tvs5bXYT-mDqzJNnVr-CYyUg4JAwWXhPswQ&s:
## cannot open destfile 'images/ucla_11.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/",
## brand, : URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTU-
## jfk6d9kvZoksHUem84EmfVshYE2A-9m5KBhx0xWLg7_7-t2Cj8QOUEQVw&s: cannot open
## destfile 'images/ucla_12.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcSCWSrFSIuW9ntgORAGHbio2VOE2neGgrvGv9WcynfOgR9lTgiW38SvZZfnHg&s:
## cannot open destfile 'images/ucla_13.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQVSwgqb3jo1nQzxzQl--
## DP2blOypnGQW_mA7h3tg2xvEn7zSUgsK9r4lJh-A&s: cannot open destfile 'images/
## ucla_14.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQpPPiegdL993rc-
## VddIcaG9AW9EiZfxG5F9Sf5QIa_UZJabsvwoszpJxTtTQ&s: cannot open destfile 'images/
## ucla_15.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, : URL
## https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRUl2I1VUjg6Ehs5hQ_m3Wju45-
## uQoUDAG1n6FIE2Q3if6WnJvgIRjIyZNtyek&s: cannot open destfile 'images/
## ucla_16.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRVMK_S62VHTFrT-
## XleoFzC7j-Ur8h0GHF1ZBpZHwwbSftLh7bW45WcImS-7a4&s: cannot open destfile 'images/
## ucla_17.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcTOtf90v5Yzyq1ctQMrjDGFFLBEG84Z0GeJX7FaaAf3BDkuL48ZYkVLoRzrdA&s:
## cannot open destfile 'images/ucla_18.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath,
## "/", brand, : URL https://encrypted-tbn0.gstatic.com/images?
## q=tbn:ANd9GcSNnkY1HtCwX_QxsWz0N6oWagmQFTUrhTsvIW1xdYGiBVCbuyOGGfl9kHDMy6c&s:
## cannot open destfile 'images/ucla_19.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## URL https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR-kpMQH1LPRgq-
## qzh7PZRlCqUtK226PYB3xHz7ZHOoluN211UZ8ztirjjO80c&s: cannot open destfile 'images/
## ucla_20.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, : URL
## https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTNbZzMXPJMIVkIJp1hMHSNlI-
## kLKFqC1mhoHRaWVviOf4rwaorn4kxYVXms_A&s: cannot open destfile 'images/
## ucla_21.jpg', reason 'No such file or directory'
## Warning in download.file(files[i], destfile = paste0(outPath, "/", brand, :
## download had nonzero exit status
ls images/

Example: Scraping finance data

Example: Pull tweets into R

library(twitteR) # load package
# Placeholder credentials: replace with your own keys and tokens
consumer_key <- 'XXXXXXXXXX'
consumer_secret <- 'XXXXXXXXXX'
access_token <- 'XXXXXXXXXX'
access_secret <- 'XXXXXXXXXX'
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)
# Search tweets matching the query (n = 1000, since 2020-01-01)
virus <- searchTwitter(
  '#China + #Coronavirus',
  n = 1000,
  since = '2020-01-01',
  retryOnRateLimit = 1e3
)
# Convert the list of tweets to a data frame, then to a tibble
virus_df <- virus %>%
  twListToDF() %>%
  as_tibble()
print(virus_df, width = Inf)